### Replication Material ###
### The Manifesto Corpus ### 
### key word lists - Table 2 ### 

library(manifestoR)
library(SnowballC)
library(tables)

# set api key
# The key is read from the file "manifesto_apikey.txt"; the path is relative,
# so the file has to be present in the current working directory.
mp_setapikey(key.file = "manifesto_apikey.txt")

# specify corpus version
# Fixing the version ("2016-1") keeps later corpus downloads tied to one
# release -- presumably so the table can be reproduced exactly; confirm
# against the manifestoR documentation.
mp_use_corpus_version("2016-1")


### FUNCTIONS & CONSTANTS

### code definitions
# all codes
# The category codes come in seven contiguous runs, one per hundred-block
# (1xx to 7xx); they are generated from those runs instead of being typed
# out one by one. as.numeric() keeps the vector a double vector, exactly like
# a hand-written c(101, 102, ...) literal.
all_codes <- as.numeric(c(
  101:110,
  201:204,
  301:305,
  401:416,
  501:507,
  601:608,
  701:706
))

### create keyword list for set of categories
#
# Returns the word stems that rank among the most frequent ones in text coded
# with `categories` but do not rank among the most frequent stems of text
# coded with any other category, using all available documents of one
# language.
#
# Arguments:
#   lang          language name, matched against the `language` column of the
#                 availability table (e.g. "german")
#   categories    numeric vector of category codes the list is built for
#   freq          number of top-ranked stems considered for `categories`; the
#                 top `2 * freq` stems of the remaining codes are excluded
#   n_terms       length of the returned vector (default 10)
#   code_universe full set of category codes; its complement to `categories`
#                 defines the comparison text (default: the file-level
#                 constant `all_codes`)
#
# Returns a character vector of length `n_terms`, ordered by decreasing
# frequency within `categories` and padded with NA when fewer stems remain.
create_dictionary <- function(lang, categories, freq, n_terms = 10,
                              code_universe = all_codes) {
  stopifnot(
    length(lang) == 1L,
    is.numeric(freq), length(freq) == 1L, freq >= 0,
    is.numeric(n_terms), length(n_terms) == 1L, n_terms >= 0
  )

  # Base subsetting instead of an unqualified filter(): this file never
  # attaches dplyr, so filter() would resolve to stats::filter() and fail.
  # which() drops rows with a missing language, as dplyr::filter() would.
  availability <- mp_availability(TRUE)$availability
  lang_docs <- availability[which(availability$language == lang), ,
                            drop = FALSE]

  # One shared set of preprocessing options for both term-document matrices.
  # NOTE(review): tm spells the lower-casing option `tolower`; `toLower` and
  # `stripWhitespace` may be silently ignored by tm -- confirm. The list is
  # kept exactly as in the original so the replicated results do not change.
  tdm_control <- list(stopwords = TRUE,
                      removeNumbers = TRUE,
                      removePunctuation = TRUE,
                      toLower = TRUE,
                      stemming = TRUE,
                      stripWhitespace = TRUE)

  # Stems of all text coded with `codes`, most frequent first.
  ranked_stems <- function(codes) {
    coded_docs <- mp_corpus(lang_docs, codefilter = codes)
    tdm <- TermDocumentMatrix(coded_docs, control = tdm_control)
    totals <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
    as.character(names(totals))
  }

  # head() never yields NA entries and returns nothing for freq == 0,
  # whereas 1:freq would select the first row in that case.
  candidates <- head(ranked_stems(categories), freq)

  anti_codes <- setdiff(code_universe, categories)
  excluded <- head(ranked_stems(anti_codes), freq * 2)

  diction <- candidates[!candidates %in% excluded]

  # Indexing past the end pads with NA, so the result always has exactly
  # `n_terms` elements.
  diction[seq_len(n_terms)]
}


### SCRIPT PART

# Languages that get a row in the table.
languages <- c(
  "danish", "dutch", "english", "finnish", "french", "german",
  "hungarian", "italian", "norwegian", "portuguese", "spanish", "swedish"
)

# One keyword list per language for category 501, based on the 100 most
# frequent stems of that category.
wordterms <- lapply(languages, function(language_name) {
  create_dictionary(lang = language_name, categories = c(501), 100)
})

# Collapse every keyword list into a single comma-separated table cell.
dictionary_matrix <- vapply(
  wordterms,
  function(terms) paste(unlist(terms), collapse = ", "),
  character(1)
)

# Two-column table: language name next to its keyword list.
output <- cbind(languages, dictionary_matrix)

# Write the table as LaTeX.
latex(
  output,
  file = "dictionary-table.tex",
  colheads = c(" ", "environment"),
  col.just = c("l", "p{2.3in}"),
  caption = "Most unique word stems by language and issue domain (category 501: environmental protection)"
)
